setwd("/home/jar/zxjar/paper_helper/userdata/0/122/")

library(haven) #用于读取xpt
library(plyr) #用于数据处理包
library(dplyr) #用于数据处理包
library(arsenal) ##用于tableby
library(survey) #用于加权分析
library(stringr) #用于字符处理
library(gtsummary)
library(missRanger)
library(caret)
library(randomForest)
library(sjPlot)
library(forestplot)



#setwd('F:/Rproject/r-language/paper1Somke/')

#agerange317dropna <- read.csv('F:/Rproject/r-language/paper1Somke//source.csv')

agerange317dropna <- read.csv('/home/jar/zxjar/paper_helper/userdata/0/122/source.csv')

agerange317dropna$allGroup <-as.factor(agerange317dropna$allGroup)
tbl_summary(agerange317dropna,
missing = 'no',
# 通过属性分组
by = allGroup,
# 进行设置显示维度
include = c(RIDAGEYR,Sex,race,FPL,DMDHHSIZ,childhouseholdmembers,familyhomeownershipstatus,HOD050),
# 批量给变量起别名
label = list(RIDAGEYR ~ "child age, M (SE)",
Sex ~ "child sex",
race ~ "child race/ethnicity",
FPL ~ "FPL",
DMDHHSIZ ~ "no. household members, M (SE)",
childhouseholdmembers ~ "no. child household members",
familyhomeownershipstatus ~ "family home ownership status",
HOD050 ~ "no. household rooms, M (SE)"
),

statistic = list(
# 分别对应数值型和分类变量 后是需要显示表达式
all_continuous() ~ "{mean}",
all_categorical() ~ "{n} ({p} ) "
),
digits = all_continuous() ~ 3
)%>%
#add_n() %>% # 添加非NA观测值个数
# 这里p值不能用p 根据原文说的是加权得来的 需要用 tbl_svysummay进行 这里除了p值 其他是正确的
# add_p() %>% # 添加P值
add_overall() %>%
#add_significance_stars(hide_se = FALSE,hide_p = TRUE)%>%
#modify_header(std.error = "**SE**")
add_ci() %>%
modify_column_merge(pattern = "{stat_0} ({ci_stat_0})")%>%
modify_column_merge(pattern = "{stat_1} ({ci_stat_1})")%>%
modify_column_merge(pattern = "{stat_2} ({ci_stat_2})")%>%
modify_column_merge(pattern = "{stat_3} ({ci_stat_3})")%>%
#modify_spanning_header(c("stat_2", "stat_3","stat_4") ~ "**TSE group**")%>%
modify_header(label = "**characteristic**",all_stat_cols() ~ "**{level}**<br>N = {n} <br> n(% [95% CI]")%>%as_flex_table()%>%flextable::save_as_html(path ='Table1_Result.html')



# show_header_names(tblchild1)
# 表格2 和上面一样 暂时不知道怎么加95%
# 进行复现表格2 部分数据进行转换
# number of household smokers -> smokersize


tbl_summary(agerange317dropna,
# 通过属性分组
by = allGroup,
# 进行设置显示维度
include = c(smokersize,cartse,otherhomeTSE,restaurantTSE,otherindoorareaTSE),
# 批量给变量起别名
label = list(smokersize ~ "number of household smokers",
cartse ~ "car TSE",
otherhomeTSE ~ "other home TSE",
restaurantTSE ~ "restaurant TSE",
otherindoorareaTSE ~ "other indoor area TSE"

),

statistic = list(
# 分别对应数值型和分类变量 后是需要显示表达式
all_continuous() ~ "{mean}±{sd}",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 3
)%>%
#add_n() %>% # 添加非NA观测值个数
# 这里p值不能用p 根据原文说的是加权得来的 需要用 tbl_svysummay进行 这里除了p值 其他是正确的
# add_p() %>% # 添加P值
add_overall() %>%
add_ci(pattern = "{stat} ({ci})") %>%
#modify_spanning_header(c("stat_2", "stat_3","stat_4") ~ "**TSE group**")%>%
modify_header(label = "**TSE variable**")%>%as_flex_table()%>%flextable::save_as_html(path ='Table2_Result.html')

colnames(agerange317dropna)
#urinary TNE2
agerange317dropna$urinaryTNE2 <- (agerange317dropna$URXCOTT /176.2151) +(agerange317dropna$URXHCTT /192.2145  )
#NNAL/TNE2
agerange317dropna$NNAL_TNE2 <- (agerange317dropna$URXNAL /agerange317dropna$urinaryTNE2)
#2-hydroxyfluorene (ng/L)
agerange317dropna$twohydroxyfluorene<- agerange317dropna$URXP04
#3-hydroxyfluorene (ng/L)
agerange317dropna$threehydroxyfluorene<- agerange317dropna$URXP03
#2-hydroxyfluorene/TNE2
agerange317dropna$twohydroxyfluorene_tne2<- agerange317dropna$twohydroxyfluorene /agerange317dropna$urinaryTNE2
#3-hydroxyfluorene/TNE2
agerange317dropna$threehydroxyfluorene_tne2<- agerange317dropna$threehydroxyfluorene /agerange317dropna$urinaryTNE2
# 2CyEMA (ng/mL)
agerange317dropna$twoCyEMA<- agerange317dropna$URXCYM
# 2CyEMA/TNE2
agerange317dropna$twoCyEMA_TNE2<- agerange317dropna$twoCyEMA /agerange317dropna$urinaryTNE2
colnames(agerange317dropna)

# 血清可替宁->lbxcot
# 羟可替宁，血清 (ng/mL)->  lbxhct
# 总羟基可替宁，尿液 (ng/mL) ->  urxhctt
# 总可替宁，尿液 (ng/mL) ->  urxcott
# NNAL，尿液 (ng/mL) -> urxnal --tsna_h.xpt
# 2-羟基芴 (ng/L) -> urxp04
# 3-羟基芴 (ng/L) -> urxp03
# N-乙酰基-S-(2-氰乙基)-L-半胱氨酸(ng/mL) -> urxcym --  2CyEMA
tbl_summary(agerange317dropna,
missing = 'no',
# 通过属性分组
by = allGroup,
# 进行设置显示维度
include = c(LBXCOT,LBXHCT,urinaryTNE2,URXNAL,NNAL_TNE2,twohydroxyfluorene,threehydroxyfluorene,twohydroxyfluorene_tne2,threehydroxyfluorene_tne2,twoCyEMA,twoCyEMA_TNE2),
# 批量给变量起别名
label = list(
LBXCOT ~ "serum cotinine (ng/mL)",
LBXHCT ~ "serum hydroxycotinine (ng/mL)",
urinaryTNE2 ~ "urinary TNE2 (nmol/mL)",
URXNAL ~ "NNAL (pg/mL)",
NNAL_TNE2 ~ "NNAL/TNE2",
twohydroxyfluorene ~ "2-hydroxyfluorene (ng/L)",
threehydroxyfluorene ~ "3-hydroxyfluorene (ng/L)",
twohydroxyfluorene_tne2 ~ "2-hydroxyfluorene/TNE2",
threehydroxyfluorene_tne2 ~ "3-hydroxyfluorene/TNE2",
twoCyEMA ~ "2CyEMA (ng/mL)",
twoCyEMA_TNE2 ~ "2CyEMA/TNE2"),
statistic = list(
# 分别对应数值型和分类变量 后是需要显示表达式
all_continuous() ~ "{mean}"
# all_categorical() ~ "{n} ({p}) "
),
digits = all_continuous() ~ 2
)%>%
add_n(statistic = "{n}({p})") %>% # 添加非NA观测值个数
add_overall() %>%
add_ci()%>%
modify_column_merge(pattern = "{stat_1} ({ci_stat_1})")%>%
modify_column_merge(pattern = "{stat_2} ({ci_stat_2})")%>%
modify_column_merge(pattern = "{stat_3} ({ci_stat_3})")%>%
modify_spanning_header(c("stat_1", "stat_2","stat_3") ~ "**TSE group membership**")%>%
modify_header(label = "**biomarker variable**",all_stat_cols() ~ "**{level}**<br>GeoM (95% CI) ")%>%as_flex_table()%>%flextable::save_as_html(path ='Table3_Result.html')


#
# show_header_names(tblchild3)
# tblchild3
#agerange317dropna <- missRanger(agerange317dropna, pmm.k = 3, num.trees = 100)
#colnames(agerange317dropna)


# 主要表格4  随机森林模型
# 还需要学习gini impurity 基尼系数
# 16个重要变量进行测试
#  number of house smoker -> smokersize
#  serum cotinine ->LBXCOT
#  serum Hydroxycotinine ->LBXHCT
#  urinary nnal -> URXNAL
#  3-hydroxyfluorene/TNE2  -> threehydroxyfluorene_tne2
#  urinary TNE2 -> urinaryTNE2
#  2-hydroxyfluorene/TNE2  -> twohydroxyfluorene_tne2
#  2CyEMA/TNE2 -> twoCyEMA_TNE2
#  NNAL/TNE2  ->NNAL_TNE2
#  2-hydroxyfluorene -> twohydroxyfluorene
#  car tse ->cartse
#  3-hydroxyfluorene -> threehydroxyfluorene
#  2CyEMA  -> twoCyEMA
#  other home tse -> otherhomeTSE
#  otherindoorareaTSE -> otherindoorareaTSE
#  restaurantTSE -> restaurantTSE

colnames(agerange317dropna)
finalModel <- agerange317dropna[,c( 'smokersize','LBXCOT','LBXHCT','URXNAL','threehydroxyfluorene_tne2','urinaryTNE2',
'twohydroxyfluorene_tne2','twoCyEMA_TNE2','NNAL_TNE2','twohydroxyfluorene','cartse','threehydroxyfluorene',
'twoCyEMA','otherhomeTSE','otherindoorareaTSE','restaurantTSE','allGroup')]

# 文章说的提取
finalModel <- missRanger(finalModel, pmm.k = 2, num.trees = 500)
# table(finalModel$allGroup)
# dim(finalModel)
# 根据数据进行分析 拆分数据 80 训练 20 验证
# set.seed(40)
# trains <- createDataPartition(y = finalModel$allGroup,p = 0.8,list = FALSE)
# trainsdata <- finalModel[trains,]
# testdata <- finalModel[-trains,]
# 这块说的是下面六个模型进行拿着20%的数据进行验证 拿到结果进型六个额外模型验证 直接验证吧 没时间了
# all reported TSE and biomarker variables -> finalModel
# all reported TSE variables -> finalModel1
# reported TSE variables excluding number of household smokers -> finalMode2
# 所有的生物标志物排除了比值的
# all biomarker variables excluding ratios -> finalModel3
# biomarker and biomarker ratio variables excluding serum cotinine, serum hydroxycotinine, urinary TNE2, and urinary NNAL -> finalModel4
# all biomarker and biomarker ratio variables -> finalModel5
# all biomarker ratio variables -> finalModel6
# 首先生成这六额外个模型

finalModel1 <- finalModel[,c('otherhomeTSE','otherindoorareaTSE','restaurantTSE','cartse','smokersize','allGroup')]
finalModel2 <- finalModel[,c('otherhomeTSE','otherindoorareaTSE','restaurantTSE','cartse','allGroup')]
finalModel3 <- finalModel[,c('LBXCOT','LBXHCT','urinaryTNE2','URXNAL','twohydroxyfluorene','threehydroxyfluorene', 'twoCyEMA', 'allGroup')]
finalModel4 <- finalModel[,c('twohydroxyfluorene','threehydroxyfluorene', 'twoCyEMA','NNAL_TNE2','threehydroxyfluorene_tne2','twohydroxyfluorene_tne2','twoCyEMA_TNE2', 'allGroup')]
finalModel5 <- finalModel[,c('LBXCOT','LBXHCT','urinaryTNE2','URXNAL','twohydroxyfluorene','threehydroxyfluorene', 'twoCyEMA','NNAL_TNE2','threehydroxyfluorene_tne2','twohydroxyfluorene_tne2','twoCyEMA_TNE2', 'allGroup')]
finalModel6 <- finalModel[,c('NNAL_TNE2','threehydroxyfluorene_tne2','twohydroxyfluorene_tne2','twoCyEMA_TNE2', 'allGroup')]

set.seed(40)
finalModeltreeResult <- randomForest(allGroup ~ .,
data = finalModel,
ntree = 500,
mtry = 16, # 自变量最大数
importance = T)
# 下面每个预测
set.seed(40)
finalModeltreeResult1 <- randomForest(allGroup ~ .,
data = finalModel1,
ntree = 500,
mtry = 5, # 自变量最大数
importance = T)
set.seed(40)
finalModeltreeResult2 <- randomForest(allGroup ~ .,
data = finalModel2,
ntree = 500,
mtry = 4, # 自变量最大数
importance = T)
set.seed(40)
finalModeltreeResult3 <- randomForest(allGroup ~ .,
data = finalModel3,
ntree = 500,
mtry = 7, # 自变量最大数
importance = T)
set.seed(40)
finalModeltreeResult4 <- randomForest(allGroup ~ .,
data = finalModel4,
ntree = 500,
mtry = 7, # 自变量最大数
importance = T)
set.seed(40)
finalModeltreeResult5 <- randomForest(allGroup ~ .,
data = finalModel5,
ntree = 500,
mtry = 11, # 自变量最大数
importance = T)
set.seed(40)
finalModeltreeResult6 <- randomForest(allGroup ~ .,
data = finalModel6,
ntree = 500,
mtry = 4, # 自变量最大数
importance = T)

#可以认为直接拿着  OOB estimate of  error  当做overall 剩下那几个class error 当做误差 也就是 准确度进行计算
board <- read.csv('/home/jar/zxjar/paper_helper/userdata/0/85/bootBoard.csv')

#varImpPlot(finalModeltreeResult,main = "variable Importance plot",type = 2)%>%flextable::save_as_image(path = 'aaa.png')



# 进行拼接表格数据 2023年5月23日13:44:21
conf <- finalModeltreeResult$confusion[,-ncol(finalModeltreeResult$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult$mtry
as <-as.data.frame(finalModeltreeResult$confusion)
board[1,2] <-numva
board[1,3] <-oveall
board[1,4] <-(1-as['NEG','class.error'])*100
board[1,5] <-(1-as['TEG','class.error'])*100
board[1,6] <-(1-as['MEG','class.error'])*100
# 完成一个
conf <- finalModeltreeResult1$confusion[,-ncol(finalModeltreeResult1$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult1$mtry
as <-as.data.frame(finalModeltreeResult1$confusion)
board[2,2] <-numva
board[2,3] <-oveall
board[2,4] <-(1-as['NEG','class.error'])*100
board[2,5] <-(1-as['TEG','class.error'])*100
board[2,6] <-(1-as['MEG','class.error'])*100
# 完成2个
conf <- finalModeltreeResult2$confusion[,-ncol(finalModeltreeResult2$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult2$mtry
as <-as.data.frame(finalModeltreeResult2$confusion)
board[3,2] <-numva
board[3,3] <-oveall
board[3,4] <-(1-as['NEG','class.error'])*100
board[3,5] <-(1-as['TEG','class.error'])*100
board[3,6] <-(1-as['MEG','class.error'])*100
# 完成3个
conf <- finalModeltreeResult3$confusion[,-ncol(finalModeltreeResult3$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult3$mtry
as <-as.data.frame(finalModeltreeResult3$confusion)
board[4,2] <-numva
board[4,3] <-oveall
board[4,4] <-(1-as['NEG','class.error'])*100
board[4,5] <-(1-as['TEG','class.error'])*100
board[4,6] <-(1-as['MEG','class.error'])*100
# 完成4个
conf <- finalModeltreeResult4$confusion[,-ncol(finalModeltreeResult4$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult4$mtry
as <-as.data.frame(finalModeltreeResult4$confusion)
board[5,2] <-numva
board[5,3] <-oveall
board[5,4] <-(1-as['NEG','class.error'])*100
board[5,5] <-(1-as['TEG','class.error'])*100
board[5,6] <-(1-as['MEG','class.error'])*100
# 完成5个
conf <- finalModeltreeResult5$confusion[,-ncol(finalModeltreeResult5$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult5$mtry
as <-as.data.frame(finalModeltreeResult5$confusion)
board[6,2] <-numva
board[6,3] <-oveall
board[6,4] <-(1-as['NEG','class.error'])*100
board[6,5] <-(1-as['TEG','class.error'])*100
board[6,6] <-(1-as['MEG','class.error'])*100
# 完成6个
conf <- finalModeltreeResult6$confusion[,-ncol(finalModeltreeResult6$confusion)]
oob <-  round((sum(diag(conf))/sum(conf)),2)
oveall <-  oob * 100
numva<-finalModeltreeResult6$mtry
as <-as.data.frame(finalModeltreeResult6$confusion)
board[7,2] <-numva
board[7,3] <-oveall
board[7,4] <-(1-as['NEG','class.error'])*100
board[7,5] <-(1-as['TEG','class.error'])*100
board[7,6] <-(1-as['MEG','class.error'])*100
write.csv(board,'/home/jar/zxjar/paper_helper/userdata/0/85/boardOut.csv')
# 完成7个
# tab 4表格完成
# write2html(board,file = 'test.html')
# tbl_summary(board)%>%flextable::save_as_html(path = 'ddddd.html')
# colnames(board)
# htmltools::save_html(file = 'sss.html',board)
# tbl_summary(board)%>%flextable::save_as_html(path = 'Table3_Result.html')
# tb4%>%flextable::save_as_html(path = 'Table3_Result.html')
#importance(agerange317dropnatreeResult)

#table5
#plot(agerange317dropnatreeResult)
# varImpPlot(agerange317dropnatreeResult)
# varImpPlot(agerange317dropnatreeResult,main = "variable Importance score" ,type= 2,sort =T )
# partialPlot(x =agerange317dropnatreeResult,pred.data = finalModel,x.var = LBXCOT )
# plot(finalModel~LBXCOT,data = finalModel)
# prop.table(table(finalModel$smokersize,finalModel$allGroup),margin = 1)

# 随机森林重要变量保存
# 解决网站 为了保存重要变量生成图片地址 https://data-flair.training/blogs/random-forest-in-r/
important <- importance(finalModeltreeResult, type=2 )
Important_Features <- data.frame(Feature = row.names(important), Importance = important[, 1])

plot_<- ggplot(Important_Features,
aes(x= reorder(Feature,Importance) , y = Importance) ) +
geom_bar(stat = "identity",
fill = "#800080") +
coord_flip() +
theme_light(base_size = 20) +
xlab("Questionnaire and Biomarker Variables") +
ylab("variable Importance score")+
ggtitle("variable Importance score") +
theme(plot.title = element_text(size=18))

ggsave("important_features.tiff", plot_)

# 最后一个表格复现  只是区分MEG 与 TEG

finalModel5 <- agerange317dropna[,c( 'smokersize','LBXCOT','LBXHCT','URXNAL','allGroup')]
colnames(finalModel5)

colnames(finalModel5)[1:4] <- c('smokersize','SerumCotinine','serumHydroxycotinine','UrinaryNNAL')

finalModel5$allGroupint <- ifelse(finalModel5$allGroup=='MEG',1,0)
finalModel5 <- missRanger(finalModel5, pmm.k = 2, num.trees = 500)
finalModel5 <- finalModel5[which(finalModel5$allGroup!='NEG'),]
finalModel5 <- finalModel5[which(finalModel5$smokersize!='0 smokers'),]
# table(finalModel5$allGroup)
# finalModel5[which(finalModel5$allGroup!='NEG'),]
# dim(finalModel5)
# dim(finalModel5[which(!is.na(finalModel5$allGroup)),])
# dim(finalModel5[which(finalModel5$allGroup!='MEG'),]) + dim(finalModel5[which(finalModel5$allGroup!='TEG'),])

# aaa <- finalModel5[which(finalModel5$allGroup!='NEG'),]


# smokersizes <- recode_factor(finalModel5$smokersize,
#                             '0 smokers' = 0,
#                             `1 smokers` = 1,
#                             `≥2 smokers` = 2
# )
# finalModel5$smokersize <-smokersizes
# finalModel5$smokersize <-as.numeric(smokersizes)

# smokersize','SerumCotinine','serumHydroxycotinine','UrinaryNNAL
m1 <- glm(
allGroupint ~ SerumCotinine + serumHydroxycotinine+UrinaryNNAL+smokersize,
data = finalModel5,
family = binomial(link = 'logit')

)

reg <- tbl_regression(m1,exponentiate =T )
reg
fit.result<-summary(m1)
df1<-fit.result$coefficients
df2<-confint(m1)
df3<-cbind(df1,df2)
df4<-data.frame(df3[-1,c(1,4,5,6)])
df4$Var<-rownames(df4)
colnames(df4)<-c("OR","Pvalue","OR_1","OR_2","Var")
df5<-df4[,c(5,1,2,3,4)]
df5$OR_mean<-df5$OR
df5$OR<-paste0(round(df5$OR,2),
"(",
round(df5$OR_1,2),
"~",
round(df5$OR_2,2),
")")
df5$Pvalue<-round(df5$Pvalue,3)

Cairo::CairoTIFF(file="table6.tiff", width=8, height=8,units="in",dpi=150)
a<-forestplot(labeltext=as.matrix(df5[,1:3]),
mean=df5$OR_mean,
lower=df5$OR_1,
upper=df5$OR_2,
zero=0.25,
boxsize=0.2,
graph.pos=2)
plot(a)



